Mateusz Biesiadowski mb406097
$f(x_1, x_2) = (x_1 + x_2)^2$
$X_1, X_2 \sim U_{[-1,1]}$
$X_1 = X_2$
We can conclude that:
$E[X_1] = E[X_2] = 0$ and $E[X_1^2] = E[X_2^2] = \frac{1}{3}$
PDP:
$g_{PD}^1(z) = E_{X_2}[(z + x_2)^2] = z^2 + 2z E_{X_2}[x_2] + E_{X_2}[x_2^2] = z^2 + \frac{1}{3}$
Ceteris Paribus
Partial Dependence Plots
Partial Dependence Plots 2
Random forest classifier vs Decision tree classifier
# Install the explanation libraries imported below. Only stdout (`1>`) is
# redirected to /dev/null, so anything pip writes to stderr stays visible.
!pip install dalex 1> /dev/null
!pip install shap 1> /dev/null
!pip install lime 1> /dev/null
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lime
import shap
import dalex as dx
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#@title Set Credentials
kaggle_username = '' #@param {type:"string"}
kaggle_api_key = '' #@param {type:"string"}

# The import is expected to raise OSError when no credentials file exists;
# that is the only case handled here. The file is then created from the
# form values above.
try:
    import kaggle
except OSError:
    import json

    kaggle_config_path = "/root/.kaggle/kaggle.json"
    # Make sure the target directory exists so the write cannot fail on a
    # fresh machine.
    os.makedirs(os.path.dirname(kaggle_config_path), exist_ok=True)
    with open(kaggle_config_path, "w") as f:
        # json.dump escapes quotes/backslashes in the credentials, which a
        # hand-concatenated JSON string would not.
        json.dump({"username": kaggle_username, "key": kaggle_api_key}, f)
    # Restrict the file to the owner (same mode as `chmod 600`).
    os.chmod(kaggle_config_path, 0o600)
    # No re-import of `kaggle` here: the original left it commented out.
# Download the heart-attack dataset archive with the Kaggle command-line tool.
!kaggle datasets download -d rashikrahmanpritom/heart-attack-analysis-prediction-dataset
# NOTE(review): the absolute /content paths assume the archive was downloaded
# into /content (presumably the notebook's working directory) — confirm when
# running outside that environment.
!unzip /content/heart-attack-analysis-prediction-dataset.zip
# Load the extracted CSV into the DataFrame used by the cells that follow.
df = pd.read_csv("/content/heart.csv")
# Columns that are expanded into one indicator column per distinct value.
CATEGORICAL_COLUMNS = ['cp', 'restecg', 'slp', 'caa', 'thall']
# Seed and hold-out fraction reused by the split and the model.
RANDOM_STATE = 2137
TEST_SIZE = 0.33

# One-hot encode every categorical column in a single call: each listed
# column is removed and its "<column>_<value>" indicator columns are appended
# after the remaining columns, in the order given by CATEGORICAL_COLUMNS.
df = pd.get_dummies(df, columns=CATEGORICAL_COLUMNS)
df.head()
# Features are every column except the 'output' target.
X = df.drop(columns='output')
y = df['output']

# Hold out TEST_SIZE of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Random forest with 150 trees, seeded with the same RANDOM_STATE.
model_rfc = RandomForestClassifier(
    n_estimators=150,
    random_state=RANDOM_STATE,
)
model_rfc.fit(X_train, y_train)
def pf_classifier_categorical(model, df):
    """Return the predicted probability of class 1 for every row of ``df``.

    Columns of ``object`` dtype are converted to ``category`` dtype before
    the data is passed to ``model.predict_proba``. The conversion is made on
    a new frame, so the caller's DataFrame is never modified.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        df: pandas DataFrame with one row per observation.

    Returns:
        Column 1 of ``model.predict_proba(...)``, one value per row.
    """
    object_columns = df.select_dtypes(['object']).columns
    if len(object_columns) > 0:
        # ``astype`` builds a new frame whose columns really are categorical.
        # Assigning through ``df.loc[:, mask] = ...`` instead writes into the
        # existing object columns (the dtype can stay ``object``) and changes
        # the frame owned by the caller.
        df = df.astype({name: 'category' for name in object_columns})
    return model.predict_proba(df)[:, 1]
# Wrap the fitted forest in a dalex Explainer. The test split is the data the
# explanations are computed on, and the custom predict function supplies the
# class-1 probability for each row.
explainer_rf = dx.Explainer(model_rfc, X_test, y_test, predict_function=pf_classifier_categorical, label="RFC")
# Performance summary of the wrapped model (shown as cell output).
explainer_rf.model_performance()
# Result table of the model-level explanation returned by `model_parts`
# (presumably variable importance — see the dalex documentation).
explainer_rf.model_parts().result
# Number of leading test rows whose predicted class is shown below.
NO_OBSERVATIONS = 4
model_rfc.predict(X_test[:NO_OBSERVATIONS])
# Ceteris Paribus profile for the single test row at position 3; the double
# brackets keep the selection a one-row DataFrame.
cp = explainer_rf.predict_profile(new_observation=X_test.iloc[[3]])
cp.plot(variables=["age", "thalachh"])
# Profiles for the test rows at positions 4 and 8, plotted together so the
# two observations can be compared.
cp_diff = explainer_rf.predict_profile(new_observation=X_test.iloc[[4, 8]])
cp_diff.plot(variables=['age', 'thalachh'])
# Partial Dependence profiles from `model_profile` with its default settings.
pdp = explainer_rf.model_profile()
pdp.result
pdp.plot(variables=["age", "thalachh"])
# Same variables again, this time with geom="profiles" as the plot title
# describes (individual profiles drawn alongside the aggregate).
pdp.plot(variables=["age", "thalachh"], geom="profiles", title="Partial Dependence Plot with individual profiles")